######################################
# Media Measurement Matters          #
# Replication Code                   #
# Appendix - Descriptives (App. E-H) #
######################################

# The following file contains code for replicating the descriptive analyses in the
# supplemental appendix. These results are reported in Appendices E, F, G, and H in
# the online supplement.

# Set-Up ----

# If desired, set up path into which to save plots and tables
plot_path <- NULL
table_path <- NULL

# Load packages
library(tidyverse)
library(ggridges)
library(ggalluvial)
library(ggpubr)
library(estimatr)
library(overlap)
library(spatstat)
library(xtable)

# Set up helper operations
`%notin%` <- Negate(`%in%`)

# Set up colors
red_mit = '#A31F34'
red_light = '#A9606C'
blue_mit = '#315485'
grey_light= '#C2C0BF'
grey_dark = '#8A8B8C'
black = '#353132'

# Source helper functions
source("helper_functions.R")

# > Read in data ----

# Read in survey data
srvy <- read_rds("data/survey_data_cleaned.rds")

# Read in web data and remove portal sites
web <- read_rds("data/web_use.rds")

# Filter out portal sites
web_noport <- web %>% 
  filter(domain_recode %notin% c("aol.com", "msn.com", "google.com"))

# Appendix E: Most Popular Websites ----

# Sort matched news domains by number of visits in our data, incl. portals
matched_visits <- web %>% 
  drop_na(avg_align) %>% 
  group_by(domain_recode) %>% 
  tally() %>% 
  arrange(desc(n))

# Preview the top 25 most-visited domains
head(matched_visits, 25) %>% as.data.frame()

# Calculate the proportion of visits to (matched) news domains that were concentrated
# in the top 25 domains
visits25 <- sum(matched_visits[1:25,] %>% pull(n))/sum(matched_visits$n)
visits25 # 79% of news visits were concentrated in top 25 most-visited domains

# Plot the number/percent of visits to each of the top 25 domains (Figure E1)
match_top25 <- matched_visits[1:25,] %>% 
  mutate(pct = (n/sum(matched_visits$n)) * 100)

(e1 <- ggplot(match_top25, aes(x = domain_recode, y = n)) + 
  geom_bar(stat = "identity") + 
  theme_bw() + 
  ylab("Domain Visits") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) + 
  scale_fill_gradient(low = "blue", high = "red", name = "Align Score") +
  geom_text(aes(label = paste0(round(pct, 0), "%")), vjust = -0.25, size = 3.5) + 
  scale_x_discrete(limits=match_top25$domain_recode) +
  scale_y_continuous(limits = c(0, 175000), expand = expansion(0,0)) + 
  theme(legend.position = "none",
        plot.title = element_blank(),
        plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
        axis.title.x = element_blank(),
        axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), 
                                    size = 12),
        legend.title = element_text(hjust = 0.5),
        axis.text.x = element_text(size = 10, color = "black"),
        axis.text.y = element_text(size = 10, color = "black"))) 

ggsave(e1, path = plot_path, filename = "fig_e1.pdf",
       height = 6, width = 9, dpi = 600)

# Appendix F: Relative Volume of News vs. Non-News ----

# > Average volume of news vs. non-news across groups ----

# Average news consumption by partisanship
srvy %>% 
  drop_na(news_consump, pid) %>% 
  group_by(pid) %>% 
  summarise(mean_consump = mean(news_consump))

# Average news consumption by ideology
srvy %>% 
  drop_na(news_consump, ideo) %>% 
  group_by(ideo) %>% 
  summarise(mean_consump = mean(news_consump))

# Average news consumption by stated preferences
srvy %>% 
  drop_na(news_consump, med_pref) %>% 
  group_by(med_pref) %>% 
  summarise(mean_consump = mean(news_consump))

# Average news consumption by stated news consumption
# days_bin = whether respondents say they consume news daily (1) or less often (0)
mean(srvy$days_bin) # 63% of respondents say they consume news daily

srvy %>% 
  drop_na(news_consump, days_bin) %>% 
  group_by(days_bin) %>% 
  summarise(mean_consump = mean(news_consump))

# Identify how many days during the pre-study period respondents visited at least one news URL
days_visited <- web %>% 
  group_by(resp_id) %>% 
  summarise(n_days = length(unique(date))) 

srvy <- srvy %>% 
  left_join(days_visited, by = "resp_id") %>% 
  mutate(days_prop = n_days/31)

# Calculate the proportion of respondents who consumed news on >90% of days
srvy %>% 
  drop_na(days_prop, days_bin) %>% 
  group_by(days_bin) %>% 
  summarise(days = mean(days_prop > .9))

# > Distributions of relative volume across groups ----

# Figure F1: plot the distribution of relative volume scores by PID, ideology,
# and stated media preferences
f1_a <- score_plots(var = "pid", var_labels = c("Democrats", "Independents", "Republicans"),
                    var_levels = c(-1, 0, 1), x_var = "news_consump",
                    x_label = "Proportion of Visits to News Domains",
                    y_label = "Party ID", by = 0.25, height = 1.85, 
                    x_limits = c(-0.1, 1.05))
ggsave(f1_a, path = plot_path, filename = "fig_f1_a.pdf",
       height = 3, width = 4.5, dpi = 600)

f1_b <- score_plots(var = "ideo", var_labels = c("Liberals", "Moderates", "Conservatives"),
                    var_levels = c(-1, 0, 1), x_var = "news_consump",
                    x_label = "Proportion of Visits to News Domains",
                    y_label = "Ideology", by = 0.25, height = 1.85, 
                    x_limits = c(-0.1, 1.05))
ggsave(f1_b, path = plot_path, filename = "fig_f1_b.pdf",
       height = 3, width = 4.5, dpi = 600)

f1_c <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), x_var = "news_consump",
                    x_label = "Proportion of Visits to News Domains",
                    y_label = "Stated Media Preference", by = 0.25, height = 1.85, 
                    x_limits = c(-0.1, 1.05))
ggsave(f1_c, path = plot_path, filename = "fig_f1_c.pdf",
       height = 3, width = 4.5, dpi = 600)

# Figure F2: plot the distributions of (a) relative volume, (b) proportion of days
# visited at least one news site, disaggregated by stated news consumption
f2_a <- score_plots(var = "days_bin", 
                    var_levels = 0:1,
                    var_labels = c("Lower News\nConsumption\n(Less than Daily)",
                                   "Higher News\nConsumption\n(At Least Daily)"),
                    x_var = "news_consump", x_label = "Proportion of Visits to News Domains",
                    y_label = "Stated News Consumption", by = 0.25, colors = rep(grey_dark, 2),
                    height = 1.85, x_limits = c(-0.1, 1.05))
ggsave(f2_a, path = plot_path, filename = "fig_f2_a.pdf",
       height = 3, width = 4.5, dpi = 600)

f2_b <- score_plots(var = "days_bin", 
                    var_levels = 0:1,
                    var_labels = c("Lower News\nConsumption\n(Less than Daily)",
                                   "Higher News\nConsumption\n(At Least Daily)"),
                    x_var = "days_prop", x_label = "Proportion of Days Visited News Domains",
                    y_label = "Stated News Consumption", by = 0.25,
                    colors = rep(grey_dark, 2), height = 1.85)
ggsave(f2_b, path = plot_path, filename = "fig_f2_b.pdf",
       height = 3, width = 4.5, dpi = 600)

# > Regression models ----

# Run OLS models predicting relative volume based on stated news consumption, as well as
# party ID, ideology, or stated media preferences. All models also control for gender (male vs. female),
# race (white vs. non-white), age (continuous), education (college degree vs. no college degree),
# political knowledge (based on a battery of 5 items), and political interest.
# Age, political knowledge, and interest in national news are all standardized.

# Figure F3: coefficients from OLS models predicting news consumption
(f3_a <- panel_plot(form_comp = "(pid==1) + (pid==-1)", dv = "news_consump",
                    label1 = "Republican", label2 = "Democrat",
                    title = "By Partisanship", scale = "Prop. News Consumption",
                    y_low = -.1, y_high = 0.1, by = 0.05))
(f3_b <- panel_plot(form_comp = "(ideo==1) + (ideo==-1)", dv = "news_consump",
                    label1 = "Conservative", label2 = "Liberal",
                    title = "By Ideology", scale = "Prop. News Consumption",
                    y_low = -.1, y_high = 0.1, by = 0.05))
(f3_c <- panel_plot(form_comp = "factor(med_pref)", dv = "news_consump",
                    label1 = "Prefer Fox", label2 = "Prefer MSNBC",
                    title = "By Stated Media Preference", scale = "Prop. News Consumption",
                    y_low = -.1, y_high = 0.1, by = 0.05))

# Combine plots
f3 <- ggarrange(f3_a, f3_b, f3_c, ncol = 3)

f3 <- print(annotate_figure(f3, 
                            bottom = text_grob(expression(paste(bold("Estimated Coefficient "), 
                                                                bolditalic("(Lower Scores = Lower News Consumption)"))),
                                               size = 12, face = "bold")))

ggsave(f3, path = plot_path, filename = "fig_f3.pdf",
       width = 16, height = 10, dpi = 600)

# Appendix G: Relative Slant of News Consumption ----

# > Distributions of relative slant across groups ----

# Figure G1: plot the distribution of relative slant scores by PID, ideology,
# and stated media preferences
g1_a <- score_plots(var = "pid", var_labels = c("Democrats", "Independents", "Republicans"),
                    var_levels = c(-1, 0, 1), x_var = "score_noportals",
                    x_label = "Respondent Avg. Slant Score\n(Lower Values = More Liberal)",
                    y_label = "Party ID")
ggsave(g1_a, path = plot_path, filename = "fig_g1_a.pdf",
       height = 3, width = 4.5, dpi = 600)

g1_b <- score_plots(var = "ideo", var_labels = c("Liberals", "Moderates", "Conservatives"),
                    var_levels = c(-1, 0, 1), x_var = "score_noportals",
                    x_label = "Respondent Avg. Slant Score\n(Lower Values = More Liberal)",
                    y_label = "Ideology")
ggsave(g1_b, path = plot_path, filename = "fig_g1_b.pdf",
       height = 3, width = 4.5, dpi = 600)

g1_c <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), 
                    x_var = "score_noportals",
                    x_label = "Respondent Avg. Slant Score\n(Lower Values = More Liberal)",
                    y_label = "Stated Media Preference")
ggsave(g1_c, path = plot_path, filename = "fig_g1_c.pdf",
       height = 3, width = 4.5, dpi = 600)

# Figure G2: plot the distribution of relative slant scores by PID, ideology,
# and stated media preferences -- for site visits vs. respondents
g2_a <- visit_plots(var = "pid", var_labels = c("Democrats", "Independents", "Republicans"),
                    var_levels = c(-1, 0, 1), 
                    x_label = "Relative Slant of News Visits",
                    y_label = "Party ID", weights = FALSE, df = web_noport, 
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com"))
ggsave(g2_a, path = plot_path, filename = "fig_g2_a.pdf", 
       height=3.5, width=8, dpi = 600)

g2_b <- visit_plots(var = "ideo", var_labels = c("Liberals", "Moderates", "Conservatives"),
                    var_levels = c(-1, 0, 1), 
                    x_label = "Relative Slant of News Visits",
                    y_label = "Ideology", weights = FALSE, df = web_noport, 
                    exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                  "foxnews.com","nytimes.com"))
ggsave(g2_b, path = plot_path, filename = "fig_g2_b.pdf", 
       height=3.5, width=8, dpi = 600)

# > Overlapping coefficients ----

# Overlap for respondent-level scores
overlap_pid <- score_overlap(var = "score_noportals", group_var = "pid", values = c(-1, 0, 1),
                             out_contrasts = c("Democrat vs. Republican",
                                               "Democrat vs. Independent",
                                               "Republican vs. Independent"))
overlap_ideo <- score_overlap(var = "score_noportals", group_var = "ideo", values = c(-1, 0, 1),
                              out_contrasts = c("Liberal vs. Conservative",
                                                "Liberal vs. Moderate",
                                                "Conservative vs. Moderate"))


overlap_pid$`Democrat vs. Republican`
overlap_ideo$`Liberal vs. Conservative`

# Overlap for visit-level scores
visit_overlap_pid <- visit_overlap(group_var = "pid", values = c(-1, 0, 1),
                                   df = web_noport,
                                   out_contrasts = c("Democrat vs. Republican",
                                                     "Democrat vs. Independent",
                                                     "Republican vs. Independent"))
visit_overlap_ideo <- visit_overlap(group_var = "ideo", values = c(-1, 0, 1),
                                    df = web_noport,
                                    out_contrasts = c("Liberal vs. Conservative",
                                                      "Liberal vs. Moderate",
                                                      "Conservative vs. Moderate"))

visit_overlap_pid$`Democrat vs. Republican`
visit_overlap_ideo$`Liberal vs. Conservative`

# > Regression models ----

# Run OLS models predicting relative slant based on stated news consumption, as well as
# party ID, ideology, or stated media preferences. All models also control for gender (male vs. female),
# race (white vs. non-white), age (continuous), education (college degree vs. no college degree),
# political knowledge (based on a battery of 5 items), and political interest.
# Age, political knowledge, and interest in national news are all standardized.

# Figure G3: coefficients from OLS models predicting respondent-level alignment scores
(g3_a <- panel_plot(form_comp = "(pid==1) + (pid==-1)",
                    label1 = "Republican", label2 = "Democrat",
                    title = "By Partisanship"))
(g3_b <- panel_plot(form_comp = "(ideo==1) + (ideo==-1)",
                    label1 = "Conservative", label2 = "Liberal",
                    title = "By Ideology"))
(g3_c <- panel_plot(form_comp = "factor(med_pref)",
                    label1 = "Prefer Fox", label2 = "Prefer MSNBC",
                    title = "By Stated Media Preference"))

g3 <- ggarrange(g3_a, g3_b, g3_c, ncol = 3)

g3 <- print(annotate_figure(g3, 
                            bottom = text_grob(expression(paste(bold("Estimated Coefficient "), 
                                                                bolditalic("(Lower Scores = More Liberal)"))),
                                               size = 12, face = "bold")))

ggsave(g3, path = plot_path, filename = "fig_g3.pdf",
       width = 16, height = 10, dpi = 600)

# > Scatterplots ----

theme_set(theme_bw() + 
            theme(plot.title = element_text(hjust = 0.5, size = 16),
                  plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
                  axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"),
                                              size = 12, angle = 0, hjust = 0.5),
                  axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), 
                                              size = 12),
                  legend.title = element_text(hjust = 0.5, face = "bold", size = 12),
                  legend.text = element_text(size = 10),
                  axis.text.x = element_text(size = 10, color = "black"),
                  axis.text.y = element_text(size = 10, color = "black"),
                  legend.position = "bottom",
                  legend.box = "vertical",
                  legend.background = element_blank(),
                  legend.box.background = element_rect(colour = "black")))

# Figure G4(a): scatterplot of relative volume vs. relative slant (excl. portals)
(g4_a <- ggplot(srvy %>% filter(!is.na(med_pref)), 
       aes(x = news_consump, y = score_noportals,
           col = factor(med_pref, levels = c("MSNBC",
                                             "Entertainment", 
                                             "Fox")), 
           shape = factor(med_pref, levels = c("MSNBC",
                                               "Entertainment", 
                                               "Fox")))) + 
  geom_point(alpha = 0.9) + 
  scale_color_grey("Stated Media Preference", start = 0.8, end = 0.3,
                   labels = c("MSNBC", "Entertainment", "Fox")) + 
  scale_shape_discrete("Stated Media Preference",
                       labels = c("MSNBC", "Entertainment", "Fox")) +
  xlab("Relative Volume of News vs. Non-News") + 
  ylab("Relative Slant of News Consumption") +
  guides(colour = guide_legend(override.aes = list(size=2.5))))

ggsave(g4_a, filename = "fig_g4_a.pdf",
       path = plot_path, width = 7, height = 4, dpi = 600)

# Figure G4(b): scatterplot of relative volume vs. relative extremey (excl. portals)
# Extremity is calculated as the absolute value of individuals' alignment scores, as
# these scores tend to range from -1 to 1
(g4_b <- ggplot(srvy %>% filter(!is.na(med_pref)), 
       aes(x = news_consump, y = abs(score_noportals),
           col = factor(med_pref, levels = c("MSNBC",
                                             "Entertainment", 
                                             "Fox")), 
           shape = factor(med_pref, levels = c("MSNBC",
                                               "Entertainment", 
                                               "Fox")))) + 
  geom_point(alpha = 0.9) + 
  scale_color_grey("Stated Media Preference", start = 0.8, end = 0.3,
                   labels = c("MSNBC", "Entertainment", "Fox")) + 
  scale_shape_discrete("Stated Media Preference",
                       labels = c("MSNBC", "Entertainment", "Fox")) +
  xlab("Relative Volume of News vs. Non-News") + 
  ylab("Extremity of News Consumption") +
  guides(colour = guide_legend(override.aes = list(size=2.5))))

ggsave(g4_b, filename = "fig_g4_b.pdf",
       path = plot_path, width = 7, height = 4, dpi = 600)

# Figure G5: scatter plot of mean and variance of revealed slant measure (excl. portals)
(g5 <- ggplot(srvy %>% filter(!is.na(med_pref)), 
       aes(x = score_noportals, y = score_var_noportals,
           col = factor(med_pref, levels = c("MSNBC",
                                             "Entertainment", 
                                             "Fox")), 
           shape = factor(med_pref, levels = c("MSNBC",
                                               "Entertainment", 
                                               "Fox")))) + 
  geom_point(alpha = 0.9) + 
  scale_color_grey("Stated Media Preference", start = 0.8, end = 0.3,
                   labels = c("MSNBC", "Entertainment", "Fox")) + 
  scale_shape_discrete("Stated Media Preference",
                       labels = c("MSNBC", "Entertainment", "Fox")) +
  xlab("Mean Alignment Score Across Visits") + 
  ylab("Variance of Alignment\nScores Across Visits") +
  guides(colour = guide_legend(override.aes = list(size=2.5))))

ggsave(filename = "fig_g5.pdf",
       path = plot_path, width = 7, height = 4, dpi = 600)

# Appendix H: Discrepancies Between Stated and Revealed Preferences ----

# > Cross-tabs of stated/revealed preferences ----

# Define group labels
srvy <- srvy %>% 
  mutate(med_label = factor(med_pref, levels = c("MSNBC", "Entertainment", "Fox")),
         code_label = factor(score_code, levels = 1:3, 
                             labels = c(c("More Liberal Than CNN",
                                          "Between CNN and Yahoo!",
                                          "More Conserv. Than Yahoo!"))))

srvy_free <- srvy %>% 
  filter(forcedchoice == 0)

# Table H1: comparison between stated and revealed preferences in full sample
comp_full <- table(srvy$med_label, srvy$code_label)
sum(comp_full) # n = 3350

# Table H2: comparison between stated and revealed preferences in free-choice group
comp_free <- table(srvy_free$med_label, srvy_free$code_label)
sum(comp_free) # n = 1679

# Format row names for these tables
bold <- function(x) {paste('{\\textbf{',x,'}}', sep ='')}
rownames(comp_full) <- bold(rownames(comp_full))
rownames(comp_free) <- bold(rownames(comp_free))

# Save output for Table H1
print(xtable(comp_full, 
             align = c("l", rep("c", 3)), digits = 1,
             caption = "Alignment of stated and revealed preferences in full sample ($n = 3350$), using coding based on exemplar sites.",
             label = "cross_tab_full"), 
      sanitize.colnames.function=bold, sanitize.text.function=identity,
      booktabs = T, scalebox = 0.8, 
      file = paste0(table_path, "/tab_h1.tex"))

# Save output for Table H2
print(xtable(comp_free,
             align = c("l", rep("c", 3)), digits = 1,
             caption = "Alignment of stated and revealed preferences among respondents in the free-choice group ($n = 1679$), using coding based on exemplar sites.",
             label = "cross_tab_free"), 
      sanitize.colnames.function=bold, sanitize.text.function=identity,
      booktabs = T, scalebox = 0.8,
      file = paste0(table_path, "/tab_h2.tex"))

# > Regression models ----

# Define empirical CDFs for MSNBC and Fox News preferrers
percentile_msnbc <- ecdf(srvy %>% filter(med_pref == "MSNBC") %>% 
                           select(score_noportals) %>% pull())
percentile_fox <- ecdf(srvy %>% filter(med_pref == "Fox") %>% 
                         select(score_noportals) %>% pull())

# Define percentiles for each respondent, where higher scores indicate that
# a respondent is more "out of step" (conservative for MSNBC, liberal for Fox) 
# compared to others within their stated preference group
srvy <- srvy %>% 
  mutate(perc_msnbc = percentile_msnbc(score_noportals),
         perc_fox = percentile_fox(score_noportals),
         pref_mod = case_when(med_pref == "MSNBC" ~ perc_msnbc,
                              med_pref == "Fox" ~ 1 - perc_fox))

# Plot coefficients from each regression. Note that respondents who state
# a preference for entertainment are excluded in all cases
(h1_a <- panel_mod(form_comp = "(pid==1) + (pid==-1)",
                      label1 = "Republican", label2 = "Democrat",
                      title = "By Partisanship"))
(h1_b <- panel_mod(form_comp = "(ideo==1) + (ideo==-1)", 
                       label1 = "Conservative", label2 = "Liberal",
                       title = "By Ideology"))
(h1_c <- panel_mod(form_comp = "factor(med_pref)", 
                       label1 = "Prefer MSNBC", 
                       title = "By Stated Media Preference"))

h1 <- ggarrange(h1_a, h1_b, h1_c, ncol = 3)

h1 <- print(annotate_figure(h1, 
                            bottom = text_grob(expression(paste(bold("Estimated Coefficient "), 
                                                                bolditalic("(Lower Scores = Less Divergent Preferences)"))),
                                               size = 12, face = "bold")))

ggsave(h1, path = plot_path, filename = "fig_h1.pdf",
       width = 16, height = 10, dpi = 600)
